knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(matrixStats)
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
library(ggrepel)

Data prep

##########################################################################################
## LSHTM: Maria’s data that she used for the hvCpG paper can be accessed from ing-p5 here:
# /mnt/old_user_accounts/p3/maria/PhD/Data/datasets/
#   
# Her hvCpG scripts are here:
# /mnt/old_user_accounts/p3/maria/PhD/Projects/hvCpGs/Scripts/

source("/home/alice/2024_hvCpG/03_prepDatasetsMaria/dataprep_MariaArrays.R")
my_list_mat <- Maria_filtered_list_mat; rm(Maria_filtered_list_mat)
cpgnames <- unique(unlist(sapply(my_list_mat, row.names)))
cpgnames <- cpgnames[order(cpgnames)]

Reproduce Maria’s results

Maria’s paper: We defined an hvCpG in the following way:

  1. in 65% of datasets in which the CpG is covered (following quality control), it has methylation variance in the top 5% of all (non-removed) CpGs.
  2. is covered in at least 15 of the 30 datasets.
## Within each dataset, calculate the CpGs variance, and keep the top 5%
top5pcvar <- lapply(my_list_mat, function(mat) {
  # Step 1: Calculate row variances
  row_variances = rowVars(mat, na.rm = T)
  df = data.frame(var=row_variances, cpg=names(row_variances))
  # Step 2
  top = top_n(df, as.integer(0.05*nrow(df)), var) ## slightly different than quantile cause keep ties
  return(top)
})

## CpGs in the top 5% variance:
cpg_counts_top <- table(unlist(lapply(top5pcvar, rownames))) %>%
  data.frame() %>% dplyr::rename("cpgs"="Var1", "all_cpgs_top5pc"="Freq")

## all covered CpGs:
cpg_counts <- table(unlist(lapply(my_list_mat, rownames))) %>%
  data.frame() %>% dplyr::rename("cpgs"="Var1", "all_cpgs"="Freq")

## Both:
cpg_counts_full <- merge(cpg_counts, cpg_counts_top, all.x = T)
rm(cpg_counts, cpg_counts_top)

## in 65% of datasets in which the CpG is covered (following quality control), it has methylation variance in the top 5% of all (non-removed) CpGs:
hvCpGs_maria <- na.omit(cpg_counts_full[cpg_counts_full$all_cpgs_top5pc/cpg_counts_full$all_cpgs >= 0.65,])

## rounding, to mimick Maria's approach
hvCpGs_maria <- na.omit(cpg_counts_full[round(cpg_counts_full$all_cpgs_top5pc/cpg_counts_full$all_cpgs, 2) >= 0.65,])

Maria detected 4143 hvCpGs. I detected 4377 hvCpGs. We both have 4143 hvCpGs in common.

Identify hvCpGs based on a sd multiplicative factor lambda

The proportion of CpGs than Maria found hvCpGs is p(hvCpG) = 1.02%.

Calculate lambda

Within each dataset k, calculate the median sd of all CpG j

all_sd_jk <- sapply(my_list_mat, function(k){
  
  ## Scale
  k = log2(k/(1-k))
  
  get_sd_k <- function(k){
    ## Calculate a vector of the row (=per CpG j) sd
    return(rowSds(k, na.rm = T))
  }
  ## Return a vector of sds, in a list for each dataset 
  return(get_sd_k(k))
})

## Plot:
df_long <- reshape2::melt(all_sd_jk, variable.name = "Vector", value.name = "SDs")

## Plot distributions of all vectors on the same graph
ggplot(df_long, aes(x = SDs, color = L1)) +
  geom_density() +
  labs(title = "Distribution of SDs accross datasets",
       x = "SDs",
       y = "Density") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_x_sqrt()
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_density()`).

## Calculate lambda per dataset
lambdas = sapply(all_sd_jk, function(x){quantile(x, 0.95, na.rm=T)/median(x, na.rm=T)})
names(lambdas) <- gsub(".95%", "", names(lambdas))
# Histogram with kernel density
ggplot(data.frame(lambda=lambdas, dataset=names(lambdas)),
       aes(x = lambdas)) +
  geom_histogram(aes(y = after_stat(density)),
                 colour = 1, fill = "white", binwidth = .1) +
  geom_density(lwd = 1, colour = 4,
               fill = 4, alpha = 0.25)+
  theme_minimal() +
  geom_vline(xintercept = median(lambdas), col = "red")

median(lambdas) #1.878303
## [1] 1.878303

Which datasets have a higher lambda, and why?

df = data.frame(nind=sapply(my_list_mat, ncol),
                dataset=names(my_list_mat),
                lambda=lambdas, 
                tissue = sapply(strsplit(names(my_list_mat), "_"), `[`, 1),
                ethnicity=sapply(strsplit(names(my_list_mat), "_"), `[`, 2)) %>%
  mutate(dataset=ifelse(dataset %in% c("Blood_Cauc", "Blood_Hisp"), "Blood_Cauc_Hisp", dataset)) %>% 
  mutate(dataset=ifelse(dataset %in% c("Blood_Mexican", "Blood_PuertoRican "), "Blood_Mex_PuertoRican ", dataset)) %>% 
  mutate(dataset=ifelse(dataset %in% c("CD4+_Estonian", "CD8+_Estonian"), "CD4+_CD8+_Estonian", dataset)) %>% 
  mutate(dataset=ifelse(dataset %in% c("Saliva_Hisp", "Saliva_Cauc"), "Saliva_Hisp_Cauc", dataset))

ggplot(data = df,
       aes(x=lambda, y=nind))+
  geom_smooth(method = "lm")+
  geom_point()+
  geom_label_repel(aes(label = dataset, fill=dataset), size= 2, alpha=.8, max.overlaps = 25)+
  theme_bw()+theme(legend.position = "none")+
  scale_x_log10()
## `geom_smooth()` using formula = 'y ~ x'

## Emphasize the outliers
df_long$col = "grey"
df_long$col[df_long$L1 %in% "BulkFrontalCortex"] <- "red"

ggplot(df_long, aes(x = SDs, group = L1, fill = col)) +
  geom_density(data = df_long[!df_long$L1 %in% "BulkFrontalCortex",], alpha = .5) +
  geom_density(data = df_long[df_long$L1 %in% "BulkFrontalCortex",], alpha = .6) +
  scale_fill_manual(values = c("grey", "red"))+
  labs(title = "Distribution of SDs accross datasets",subtitle = "Red=BulkFrontalCortex",
       x = "SDs",
       y = "Density") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_x_sqrt()
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

## Emphasize the outliers
df_long$col = "grey"
df_long$col[df_long$L1 %in% "Blood_Cauc"] <- "red"

ggplot(df_long, aes(x = SDs, group = L1, fill = col)) +
  geom_density(data = df_long[!df_long$L1 %in% "Blood_Cauc",], alpha = .5) +
  geom_density(data = df_long[df_long$L1 %in% "Blood_Cauc",], alpha = .6) +
  scale_fill_manual(values = c("grey", "red"))+
  labs(title = "Distribution of SDs accross datasets",subtitle = "Red=Blood_Cauc",
       x = "SDs",
       y = "Density") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_x_sqrt()
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_density()`).

## Emphasize the outlier
df_long$col = "grey"
df_long$col[df_long$L1 %in% "Blood_Hisp"] <- "red"

ggplot(df_long, aes(x = SDs, group = L1, fill = col)) +
  geom_density(data = df_long[!df_long$L1 %in% "Blood_Hisp",], alpha = .5) +
  geom_density(data = df_long[df_long$L1 %in% "Blood_Hisp",], alpha = .6) +
  scale_fill_manual(values = c("grey", "red"))+
  labs(title = "Distribution of SDs accross datasets",subtitle = "Red=Blood_Hisp",
       x = "SDs",
       y = "Density") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_x_sqrt()
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 8 rows containing non-finite outside the scale range
## (`stat_density()`).

Maximum likelihood analysis

The equation is:

\[ \log\left(P(M_j)\right) = \sum_{i=1}^{n} \log\left( \sum_{Z_j=0}^{1} \left( \sum_{Z_{j,k}=0}^{1} P(M_{i,j} \mid Z_{j,k}) \times P(Z_{j,k} \mid Z_j) \right) \times P(Z_j) \right) \]

Examine weird values

The transformation lead to weird values:

# my_list_mat$Blood_Hisp[rownames(my_list_mat$Blood_Hisp) %in% "cg23089912",1:10]
# scaled_list_mat$Blood_Hisp[rownames(scaled_list_mat$Blood_Hisp) %in% "cg23089912",1:10]
# 
# test <- readRDS("/mnt/old_user_accounts/p3/maria/PhD/Data/datasets/GEO/Raw_cleaned_beta_matrices_GEO/Blood_Hisp")
# test[rownames(test) %in% "cg23089912",5:10]
# 
# test <- readRDS("/mnt/old_user_accounts/p3/maria/PhD/Data/datasets/GEO/BMIQ + 10 PCs + age + sex OUTLIERS REMOVED/Blood_Hisp.RDS")
# test[rownames(test) %in% "cg23089912",5:10]
# # GSM1870986
# # 0.00000000000000001124546 --> give extreme value in scaling, and p dnorm=0

The zero in 8h sample was weirdly transformed. The cleaned values are very different from the original. Is that expected?

Optimisation over multiple CpGs

Run on LSHTM server (need to be outside of RStudio): source(“run_hvCpGdetection.R”) ## for different parameters

“Nelder-Mead” or “L-BFGS-B” optimisation method?

load("/home/alice/2024_hvCpG/05_hvCpGalgorithm/resultsDir/results_NM_test3000_0.95.RDA")
results_NM_test3000_0.95 <- result %>% data.frame %>% tibble::rownames_to_column("CpG") %>% 
  dplyr::rename(alpha_NM_0.95=".")

load("/home/alice/2024_hvCpG/05_hvCpGalgorithm/resultsDir/results_LB_test3000_0.95.RDA")
results_LB_test3000_0.95 <- result %>% data.frame %>% tibble::rownames_to_column("CpG") %>% 
  dplyr::rename(alpha_LB_0.95="alpha")

merge(results_NM_test3000_0.95, results_LB_test3000_0.95, by = "CpG") -> results_test3000_0.95
results_test3000_0.95 %>% mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG, 
                                                           "1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>% ggplot(aes(x=alpha_LB_0.95, y=alpha_NM_0.95, fill = Previous.results)) +
  geom_abline(slope = 1) +
  geom_point(pch=21, size = 3, alpha = .8) +
  theme_bw()

Some CpGs have an alpha of 0 for LBB algorithm and different alpha for NM

results_test3000_0.95[round(results_test3000_0.95$alpha_NM_0.95, 1) != round(results_test3000_0.95$alpha_LB_0.95, 1),] %>% mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG, "1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>% melt() %>%
  ggplot(aes(x=CpG, y=value, fill = variable)) +
  geom_point(pch=21, size = 3, alpha = .8) +
  facet_grid(Previous.results~.) + theme_bw() + theme(axis.text.x = element_blank())
## Using CpG, Previous.results as id variables

The probability of zero seems to be an artefact of the “L-BFGS-B” optimisation method, so we choose “Nelder-Mead”.

Test different p0

load("/home/alice/2024_hvCpG/05_hvCpGalgorithm/resultsDir/results_NM_test3000_0.80.RDA")
results_NM_test3000_0.80 <- result %>% data.frame %>% tibble::rownames_to_column("CpG") %>% 
  dplyr::rename(alpha_NM_0.80=".")

load("/home/alice/2024_hvCpG/05_hvCpGalgorithm/resultsDir/results_NM_test3000_0.99.RDA")
results_NM_test3000_0.99 <- result %>% data.frame %>% tibble::rownames_to_column("CpG") %>% 
  dplyr::rename(alpha_NM_0.99=".")

merge(merge(results_NM_test3000_0.80, results_NM_test3000_0.99), results_NM_test3000_0.95) %>%
  mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG, 
                                   "1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>% 
  melt() %>%
  mutate(p0 = sub(".*_", "", variable)) %>%
  ggplot(aes(x=CpG, y=value, fill = Previous.results)) +
  geom_point(pch=21, size = 2, alpha = .5) + 
  scale_fill_manual(values = c("pink","black")) +
  ylab("alpha (probability of being a hvCpG)") +
  facet_grid(.~p0) + theme_bw() + theme(axis.text.x = element_blank()) 
## Using CpG, Previous.results as id variables

merge(merge(results_NM_test3000_0.80, results_NM_test3000_0.99), results_NM_test3000_0.95) %>%
  mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG, 
                                   "1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>% 
  melt() %>%
  mutate(p0 = sub(".*_", "", variable)) %>%
  ggplot(aes(x = factor(p0), y = value, fill = Previous.results)) +
  geom_point(
    pch = 21, alpha = .2,
    position = position_jitterdodge(
      jitter.width = 0.2,  # Horizontal jitter amount
      dodge.width = 0.8     # Must match boxplot dodge width
    )
  ) +
  geom_boxplot(position = position_dodge(width = 0.8), alpha = .5) +
  scale_fill_manual(values = c("pink", "grey")) +
  xlab("p0 (probability of true negative)") +
  ylab("alpha (probability of being a hvCpG)") +
  theme_bw()
## Using CpG, Previous.results as id variables

P0=0.95 sounds fine, not much improvement by taking 0.99

What are the hvCpGs of Maria that I found with a very low probability of being a hvCpG with my approach?

source("../03_prepDatasetsMaria/dataprep_MariaArrays.R")

getplotfewCpG <-function(myvec, title){
  listToTest <- lapply(Maria_filtered_list_mat, function(df) df[row.names(df) %in% myvec, , drop = FALSE])
  # Combine the list into one data frame
  mylist <- lapply(lapply(listToTest, as.data.frame), tibble::rownames_to_column, var = "CpG")
  # Combine all into one long data frame
  long_df <- bind_rows(mylist, .id = "Dataset")
  # Pivot to long format: one row per CpG/sample/tissue
  df_long <- na.omit(long_df %>%
                       tidyr::pivot_longer(
                         cols = -c(Dataset, CpG),
                         names_to = "Sample",
                         values_to = "Value"
                       ))
  
  # Plot with color by source data frame
  ggplot(df_long, aes(x = Dataset, y = Value, color = Dataset)) +
    geom_point() +
    theme_bw() + theme(legend.position = "none") + facet_grid(.~CpG) +
    ggtitle(title)+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 5))  
}

myvec1 <- results_NM_test3000_0.95 %>%
  mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG, 
                                   "1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>% 
  arrange(., alpha_NM_0.95) %>%
  filter(Previous.results %in% "1500 hvCpG in Maria's study") %>% head(n=3) %>% select(CpG) %>%
  unlist()
getplotfewCpG(myvec1, "Maria's hvCpG with the 3 lowest probabilities to be one in Alice's")

myvec2 <- results_NM_test3000_0.95 %>%
  mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG, 
                                   "1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>% 
  arrange(., desc(alpha_NM_0.95)) %>%
  filter(Previous.results %in% "1500 hvCpG in Maria's study") %>% head(n=3) %>% select(CpG) %>%
  unlist()
getplotfewCpG(myvec2, "Maria's hvCpG with the 3 highest probabilities to be one in Alice's")

myvec3 <- results_NM_test3000_0.95 %>%
  mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG, 
                                   "1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>% 
  arrange(., desc(alpha_NM_0.95)) %>%
  filter(!Previous.results %in% "1500 hvCpG in Maria's study") %>% head(n=3) %>% select(CpG) %>%
  unlist()
getplotfewCpG(myvec3, "CpG not detected by Maria as hvCpG, with the 3 highest probabilities to be one in Alice's")

myvec4 <- results_NM_test3000_0.95 %>%
  mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG, 
                                   "1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>% 
  arrange(., alpha_NM_0.95) %>%
  filter(!Previous.results %in% "1500 hvCpG in Maria's study") %>% head(n=3) %>% select(CpG) %>%
  unlist()
getplotfewCpG(myvec4, "CpG not detected by Maria as hvCpG, with the 3 lowest probabilities to be one in Alice's")

Notes:

lambdas are defined based on a 5% threshold, even if alpha is not. How to not hardcode them? MCMC on lambda? Link with alpha?

The analysis is complete.